# Clean eMBS security data (leave only pools, and keep only FNM and FHL)
# Also delete balloon MBS; keep only TBA-eligible, high-LTV, and jumbo-conforming MBS
#  1) keep pool of pools (delete STRIP and REMIC though)
#  2) Up to those issued in Jan 2019


# Turn on R's `echo` option, which controls whether input statements are
# echoed to the output in non-interactive runs (presumably so the log shows
# which statement produced each printed row count).
options(echo = TRUE)

library(readr)
library(dplyr)
library(lubridate)
library(stringr)

# Locate the scratch-data folders. The project root is taken to be the parent
# of the current working directory, so the result depends on where the script
# is launched from.
rootdir <- getwd() %>% dirname()
rawdir <- rootdir %>% file.path("data", "scratch_data", "raw_data")
cachedir <- rootdir %>% file.path("data", "scratch_data", "cached_data")

# Load the raw eMBS extract. The compact col_types string assigns a type to
# each of 17 columns by position (c = character, d = double, i = integer,
# T = date-time).
embs <- file.path(rawdir, "embs.csv") %>%
  read_csv(col_types = "cccdTiTcicciccdii")

# Recode the -999 sentinel in the origination statistics to NA and convert
# issue_date / mat_date to Date.
# Functions are handed to mutate_at() directly instead of through funs(),
# which has been deprecated since dplyr 0.8.0 and warns on every run.
# na_if() replaces the ifelse() recode and keeps each column's type.
embs <- embs %>%
  mutate_at(c("original_wac", "original_wala", "original_wam"),
            function(x) na_if(x, -999)) %>%
  mutate_at(c("issue_date", "mat_date"), as.Date)

# Keep only POOL and TBA records whose agency_id is FHL or FNM.
# %in% never yields NA, so rows with a missing subtype or agency are dropped.
cleaned <- embs %>%
  filter(embs_productsuptype %in% c("POOL", "TBA"),
         agency_id %in% c("FHL", "FNM"))

print(nrow(cleaned))

# Keep pools of pools as well as pools of loans (collateral_type LOAN or POOL).
# Keep pools issued up to and including January 2019. The cutoff is written as
# "before 2019-02-01" so that a pool issued on any day of January 2019 is
# retained; comparing against 2019-01-01 would keep only pools dated the 1st.
# TBA records bypass both the date filter and the collateral filter.
cleaned <- cleaned %>%
  filter(embs_productsuptype == "TBA" | issue_date < as.Date("2019-02-01")) %>%
  filter(collateral_type %in% c("LOAN", "POOL") | embs_productsuptype == "TBA") %>%
  # Set tbaeligcode_id to NA for TBA records and for the "#" code; every other
  # value (including "Y", "N" and NA) passes through unchanged.
  mutate(tbaeligcode_id = case_when(
    embs_productsuptype == "TBA" ~ NA_character_,
    tbaeligcode_id == "#" ~ NA_character_,
    TRUE ~ tbaeligcode_id
  ))
print(nrow(cleaned))

# Deduplicate on cusip_id: when a CUSIP appears more than once, keep the row
# that sorts last by security_id, which seems to be the correct record.
# Relatively few rows are dropped by this step.
# After sorting the whole table by security_id, the row to keep for each
# cusip_id is simply its last occurrence; the table stays in security_id order.
cleaned <- cleaned %>%
  arrange(security_id) %>%
  filter(!duplicated(cusip_id, fromLast = TRUE))
print(nrow(cleaned))

# Diagnostic: tabulate the non-TBA securities by collateral_type to see how
# many are pools of pools rather than pools of loans.
print("First-level vs second-level securities")
cleaned %>%
  filter(embs_productsuptype != "TBA") %>%
  count(collateral_type, name = "count") %>%
  print()

# Drop balloon MBS, then keep only TBA, TBA-eligible, high-LTV and jumbo
# securities. All tests look at the suffix of embs_product.
cleaned <- cleaned %>%
  filter(
    # Balloon products end in "BALL".
    str_sub(embs_product, -4L) != "BALL",
    # Products ending in "INIOJM" are excluded as well, even though the "JM"
    # suffix would otherwise classify them as Jumbo
    # (presumably interest-only jumbo -- TODO confirm).
    str_sub(embs_product, -6L) != "INIOJM"
  ) %>%
  mutate(
    # The first matching rule wins, so a TBA record is always "TBA" and the
    # product suffix takes precedence over the TBA-eligibility flag.
    type = case_when(
      embs_productsuptype == "TBA" ~ "TBA",
      str_sub(embs_product, -2L) == "JM" ~ "Jumbo",
      str_sub(embs_product, -5L) == "HILTV" ~ "High_LTV",
      tbaeligcode_id == "Y" ~ "TBA_eligible",
      TRUE ~ "Other_TBA_ineligible"
    ),
    # First run of digits in the product name, as a number.
    bond_term_yr = as.numeric(str_extract(embs_product, "[[:digit:]]+"))
  ) %>%
  filter(type != "Other_TBA_ineligible")

# Diagnostic: show every product code that survived the filters.
print("List of unique embs_products")
print(unique(pull(cleaned, embs_product)))

# Diagnostic: list securities whose issue_date is not the 1st of the month.
# Only 3 cusip_ids are affected, all issued in the 1980s, so they can safely
# be ignored.
tmp <- filter(cleaned, mday(issue_date) > 1)
print(tmp)


# Cache the cleaned security table for downstream scripts.
cleaned %>%
  saveRDS(file.path(cachedir, "embs_cleaned_withpoolsq.RDS"))


